Notes:
library(ggplot2)
# Switch the session to the course folder, then read the tab-separated
# pseudo-Facebook data set into the data frame `pf`.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# before running this elsewhere.
setwd("/Users/Yuji/Library/Mobile Documents/com~apple~CloudDocs/Course/Data Analysis with R")
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")
Notes:
# Scatterplot of friend_count against age, one point per row of `pf`.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()
Response: People under 30 tend to have many more friends than the other age groups. ***
Notes:
# Same scatterplot with the age axis limited to 13-90.
# Limits set on the scale discard the rows that fall outside them, which is
# what the "Removed ... rows" warning reports.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 4906 rows containing missing values (geom_point).
Notes:
# Jittered, mostly transparent points (alpha = 1/20) to reduce overplotting;
# the age axis is again limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05, position = "jitter") +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 5186 rows containing missing values (geom_point).
Response: There is less noise, and we can see the pattern in the under-30 group. Most people under 30 have fewer than 1000 friends. ***
Notes:
# Jittered scatterplot drawn on a square-root y coordinate system, with the
# axes limited to ages 13-90 and friend counts 0-5000.
# NOTE(review): the default jitter moves points vertically as well as
# horizontally, so counts near zero can presumably be pushed below 0 and then
# dropped by the y limits; position_jitter(height = 0) would avoid that.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05, position = "jitter") +
  scale_x_continuous(limits = c(13, 90)) +
  scale_y_continuous(limits = c(0, 5000)) +
  coord_trans(y = "sqrt")
## Warning: Removed 6142 rows containing missing values (geom_point).
The bottom edge of the point cloud is no longer straight, and the y-axis is no longer evenly spaced. ***
Notes:
# Scatterplot of friendships_initiated against age.
# Points are jittered horizontally only: `height = 0` keeps every count at
# its true y value. The argument is spelled out in full rather than relying
# on partial matching of `h`.
ggplot(aes(x = age, y = friendships_initiated), data = pf) +
  geom_point(alpha = 1 / 10, position = position_jitter(height = 0))
Notes:
Notes:
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Function-call form of the per-age summary.
# `age_groups` holds `pf` grouped by age; `pf.fc_by_age` gets one row per age
# with the mean and median friend count and the number of users, sorted by
# age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)
# Show the first six rows.
head(pf.fc_by_age, n = 6)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# Pipe form of the same per-age summary: group by age, compute the mean and
# median friend count and the group size, then sort by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
Create your plot!
# Line plot of the mean friend count for each age, from the summary table.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
Notes:
# Raw friend counts (orange, horizontally jittered) overlaid with per-age
# summary lines: the mean (black), the median (solid blue) and the 10% and
# 90% quantiles (dashed blue). coord_cartesian() zooms to ages 13-70 and
# counts 0-1000 without dropping rows, so the summaries use all the data.
#
# The summary function is passed as `fun` (the older `fun.y` is deprecated)
# and its extra arguments go through `fun.args`; a bare `probs = ...` is not
# forwarded to quantile(). Requires ggplot2 >= 3.3.0.
ggplot(aes(x = age, y = friend_count), data = pf) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(alpha = 1 / 20,
             position = position_jitter(height = 0),
             color = "orange") +
  geom_line(stat = "summary", fun = mean) +
  geom_line(stat = "summary", fun = quantile,
            fun.args = list(probs = 0.1),
            linetype = 2, color = "blue") +
  geom_line(stat = "summary", fun = quantile,
            fun.args = list(probs = 0.5),
            color = "blue") +
  geom_line(stat = "summary", fun = quantile,
            fun.args = list(probs = 0.9),
            linetype = 2, color = "blue")
Response: The 10% quantile line is the flattest; the other three lines follow the same trend. ***
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Pearson correlation test between age and friend count over all users.
cor.test(pf$age, pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response:
Notes:
# Same Pearson test restricted to users aged 70 or younger.
# which() keeps only rows where the condition is TRUE, as subset() does.
with(
  pf[which(pf$age <= 70), ],
  cor.test(age, friend_count, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.5923, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Notes:
Notes:
# likes_received against www_likes_received with a red linear-model fit.
# Both axes are zoomed to the range from 0 to the 95th percentile of their
# variable; coord_cartesian() zooms without removing rows.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm", color = "red") +
  coord_cartesian(
    xlim = c(0, quantile(pf$www_likes_received, probs = 0.95)),
    ylim = c(0, quantile(pf$likes_received, probs = 0.95))
  )
Notes:
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Pearson correlation test between the two like counts, using every row
# (including the top 5% of values).
with(
  pf,
  cor.test(x = likes_received, y = www_likes_received, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: likes_received and www_likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Response:
Notes:
Notes:
library(alr3)
## Loading required package: car
# Load the Mitchell data set and plot Temp against Month.
data("Mitchell")
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
Create your plot!
# Pearson correlation test between Month and Temp.
with(Mitchell, cor.test(x = Month, y = Temp, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: Month and Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Take a guess for the correlation coefficient for the scatterplot.
What is the actual correlation of the two variables? (Round to the thousandths place)
Notes:
# Temp against Month with an x-axis break every 12 months.
# Month is numeric (it is correlated and taken modulo 12 elsewhere in this
# file), so the breaks belong on a continuous scale rather than a discrete
# one.
ggplot(aes(y = Temp, x = Month), data = Mitchell) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
# Overlay all years by plotting Temp against Month modulo 12.
# The parentheses around the expression are kept so that the default axis
# label stays the same.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()
What do you notice? Response:
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
# Fractional age: age plus (12 - dob_month) / 12, stored as a new column.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)
Programming Assignment
# Pipe form: one row per age_with_months value with the mean and median
# friend count and the number of users, sorted by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
# Function-call form of the same summary. `group_age_months` holds `pf`
# grouped by age_with_months; the summarised table is sorted by that column.
group_age_months <- group_by(pf, age_with_months)
pf.fc_by_age_months <- arrange(
  summarise(
    group_age_months,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)
# Mean friend count by age in months, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
Notes:
# p1: mean friend count by age in months (ages below 71), with a smoother.
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: mean friend count by age in whole years (ages below 71), with a
# smoother.
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# p3: mean friend count with ages rounded to the nearest multiple of 5, for
# users younger than 71. The mean is computed per rounded age by the summary
# stat. The summary function is passed as `fun` because `fun.y` is
# deprecated (requires ggplot2 >= 3.3.0).
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
             data = subset(pf, age < 71)) +
  geom_line(stat = "summary", fun = mean)
library(gridExtra)
## Loading required package: grid
# Stack the three plots vertically in a single column.
grid.arrange(p1, p2, p3, nrow = 3, ncol = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Notes:
Reflection:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!